# Dictionary Section ----

library(quanteda)

# Input locations ----
# Replace with the path to the csv file containing the speaker/text
# transcriptions.
transcript_path <- "~/Desktop/Nunatfinal/NunatDictionaryText.csv"
# Replace with the path to the .LC3 dictionary. The dictionary format is
# Lexicoder; see the Lexicoder documentation at
# http://www.snsoroka.com/data-lexicoder/ for further information.
dictionary_path <- "~/Desktop/Nunatfinal/localnonlocaldic.lc3"

# Load inputs ----
data <- read.csv(transcript_path)
Dict <- dictionary(file = dictionary_path, format = "lexicoder")

# Tibble (data frame) that holds the speaker and text of each row of `data`;
# the text is coerced to character. Later steps add their results to it.
tibbz <- tibble::tibble(
  speaker = data$speaker,
  text = as.character(data$text)
)

# Corpus creation: one document per row of `tibbz`.
mycorpus <- corpus(tibbz)

# Dictionary counts ----
# dfm() used to be called directly on the corpus with `dictionary =` and
# `remove =`. That form has been deprecated since quanteda 3.0 and is no longer
# supported in current releases, so the same steps are spelled out explicitly:
# tokenize, look up the dictionary keys, then build the document-feature matrix.
#
# Numbers, punctuation and symbols are dropped before the look-up. Stopwords
# are not removed here, so multi-word dictionary entries that contain a
# stopword can still match.
# NOTE(review): the legacy one-step dfm() call appears to have applied the
# dictionary before `remove`, so its stopword removal had no effect on the
# counts -- confirm against the quanteda version used for the paper.
toks_dict <- tokens(
  mycorpus,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE
)
locz <- dfm(tokens_lookup(toks_dict, dictionary = Dict))

# The columns are read by position below, so fail early if the dictionary
# did not produce at least two keys.
stopifnot(nfeat(locz) >= 2L)

# Denominator ----
# Tokens for the total word count (the Bag of Words method Lexicoder employs):
# punctuation is removed, then English stopwords.
# NOTE(review): numbers and symbols are still counted here although they are
# excluded from the dictionary counts above; kept as in the original analysis.
toks <- tokens(mycorpus, remove_punct = TRUE)
toks_nostop <- tokens_select(
  toks,
  pattern = stopwords("en"),
  selection = "remove"
)

# Per-intervention results ----
# Count of flagged words for the first dictionary key
# (assumed to be the "local" key -- verify against the .lc3 file).
tibbz$local <- as.numeric(locz[, 1])
# Same for the second key (the "non-local" section that ended up not being
# used in the paper).
tibbz$nonlocal <- as.numeric(locz[, 2])
# Total number of tokens left in the intervention after stopword removal.
tibbz$total <- ntoken(toks_nostop)
# Proportion of local words over total words; an intervention with zero
# remaining tokens yields NaN (0 / 0).
tibbz$proplocal <- tibbz$local / tibbz$total